Regression in depth: changing the m (slope) value

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the advertising data set and build train/test splits.
# set.seed() makes the random split reproducible across runs.
# NOTE: the original sampled the 160 training rows and the 40 test rows
# independently from the full data, so the two sets could overlap (data
# leakage); sampling the indices once and splitting avoids that.
# The redundant setwd() was dropped: read.csv() already gets an absolute path.
set.seed(42)
advertising <- read.csv("C:/Users/Administrator/Desktop/Machine Learning/DATA SETS/Advertising.csv")
train_idx <- sample(seq_len(nrow(advertising)), 160)
adv_train <- advertising[train_idx, ]
adv_testing <- advertising[setdiff(seq_len(nrow(advertising)), train_idx), ]

# Reference model: simple linear regression of sales on TV spend.
advertising_model <- lm(sales ~ TV, data = adv_train)


# Manually chosen slope (m) and intercept (c) for sales ~ TV.
m <- 0.09
c <- 1  # NOTE(review): shadows base::c(); c() calls still resolve, but renaming would be safer

# Predictions and mean squared error on the training rows.
sales_predicted <- m * adv_train$TV + c
error <- sum((adv_train$sales - sales_predicted)^2) / nrow(adv_train)
error
## previous run: [1] 23.84551
# Scatter plot with the manual fit. BUG FIX: lines() connects points in
# data order, so on unsorted TV values it drew a zigzag; ordering by TV
# first draws one clean line.
{{ord <- order(adv_train$TV)
  plot(adv_train$TV, adv_train$sales)
  lines(adv_train$TV[ord], sales_predicted[ord])}}

# Candidate slope values: 100 evenly spaced points spanning [0, 1]
# (seq.int() would work just as well here).
m <- seq(0, 1, length.out = 100)
m
## prints the 100 candidate slopes: 0.00000000, 0.01010101, ..., 1.00000000
length(m)
## [1] 100
# Visualise the candidate line for each of the 100 slope values: every
# iteration redraws the TV-vs-sales scatter and overlays the line for
# that slope (intercept c is still the fixed scalar from above).
for (slope in m) {
  sales_predicted <- slope * adv_train$TV + c
  error <- sum((adv_train$sales - sales_predicted)^2) / nrow(adv_train)
  # print(error)
  {{plot(adv_train$TV, adv_train$sales)
    lines(adv_train$TV, sales_predicted)}}
}

# Widen the slope grid to [-1, 1] and record the training MSE for every
# candidate slope (intercept c fixed).
m <- seq(-1, 1, length.out = 100)  # or use seq.int()

# vapply replaces the original grow-by-c() loop: appending with c() inside
# a loop copies the whole vector every iteration (accidental O(n^2));
# vapply preallocates the typed result.
e <- vapply(m, function(slope) {
  sales_predicted <- slope * adv_train$TV + c
  sum((adv_train$sales - sales_predicted)^2) / nrow(adv_train)
}, numeric(1))
plot(e)

min(e)
## previous run: [1] 21.99689
which(e == min(e))
## previous run: [1] 54
# Best slope. Generalized from the hard-coded m[54]: which.min() finds the
# index of the minimum regardless of the random split.
m[which.min(e)]
## previous run: [1] 0.07070707

Changing the c (intercept) value

# Joint grid search over slope (m) and intercept (c).
# If you take length.out as 1000 the best pair will be even nearer to the
# lm() coefficients from the previous section: increasing length.out
# refines the grid and moves the result closer to lm()'s values.

m <- seq(-1, 1, length.out = 100)
c <- seq(-10, 10, length.out = 100)  # NOTE(review): shadows base::c()

# expand.grid replaces the two nested grow-by-c() loops (appending inside a
# loop copies the vectors every iteration). Its FIRST column varies fastest,
# which reproduces the original inner loop over the intercept, so the
# element order of e — relied on by the zspace matrix below — is unchanged.
grid <- expand.grid(intercept = c, slope = m)
e <- mapply(function(slope, icpt) {
  sales_predict <- slope * adv_train$TV + icpt
  sum((adv_train$sales - sales_predict)^2) / nrow(adv_train)
}, grid$slope, grid$intercept)
m_rep <- grid$slope
c_rep <- grid$intercept

models <- data.frame(slope = m_rep, intercept = c_rep, mse = e)
dim(models)
## previous run: [1] 10000     3
# Lowest-MSE (slope, intercept) pair.
models %>% arrange(mse) %>% head(1)
## previous run:
##        slope intercept      mse
## 1 0.05050505  6.767677 11.17694
plot(e)

# 3D graph of above result

# plotly was already attached at the top of the file; this repeat call is
# redundant but harmless.
library(plotly)
mspace = m  # slope grid (one surface axis)
cspace = c  # intercept grid (the other surface axis)
# Fold the flat error vector back into a slope-by-intercept matrix.
# NOTE(review): e was built with the intercept varying fastest, while
# matrix() fills column-wise along the slope dimension — the surface may be
# transposed relative to the axis labels; verify orientation. No error is
# raised here because nrow == ncol == 100.
zspace = matrix(e,nrow=length(m),ncol=length(c))

# Interactive 3D error surface over the (slope, intercept) grid.
plot_ly(x=mspace,y=cspace,z=zspace) %>% add_surface()
# Grid search with two slopes (TV and radio) plus an intercept.
m1 <- seq(-1, 1, length.out = 10)
m2 <- seq(-1, 1, length.out = 10)
c <- seq(-10, 10, length.out = 10)  # NOTE(review): shadows base::c()

# expand.grid preallocates all 1000 (m1, m2, c) combinations instead of
# growing vectors with c() in three nested loops. The first column varies
# fastest, matching the original innermost loop over the intercept.
grid3 <- expand.grid(intercept = c, slope_radio = m2, slope_tv = m1)
e <- mapply(function(s_tv, s_radio, icpt) {
  sales_predicted <- s_tv * adv_train$TV + s_radio * adv_train$radio + icpt
  sum((adv_train$sales - sales_predicted)^2) / nrow(adv_train)
}, grid3$slope_tv, grid3$slope_radio, grid3$intercept)
m1_rep <- grid3$slope_tv
m2_rep <- grid3$slope_radio
c_rep <- grid3$intercept

# BUG FIX: the original built data.frame(slope = c(m1_rep, m2_rep), ...),
# stacking 2000 slope values against 1000 intercepts/mses — data.frame
# silently recycled the shorter columns, producing 2000 meaningless rows —
# and then inspected the PREVIOUS `models` object instead of models1.
# Keep one column per slope and report on models1 itself.
models1 <- data.frame(slope_tv = m1_rep, slope_radio = m2_rep,
                      intercept = c_rep, mse = e)
dim(models1)
## expected: [1] 1000    4
# Lowest-MSE (slope_tv, slope_radio, intercept) combination.
models1 %>% arrange(mse) %>% head(1)

Calculating the value of m (using the gradient-descent update formula), which comes out close to the coefficient the lm command gives

Gradient descent

# Synthetic data: y is an EXACT linear function of x (slope 0.05, no noise).
# set.seed() makes the random draw reproducible run to run; the original
# had no seed, so x changed on every execution.
set.seed(1)
x <- rnorm(100)
y <- 0.05 * x
df_xy <- data.frame(x = x, y = y)
plot(x, y)

cor(x, y)  # measure of relatedness; exactly 1 because y = 0.05 * x with no noise
## [1] 1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#df_xy = df_xy %>% mutate(xy=x *y)
#df_xy = df_xy %>% mutate(mx_square = m * (x ^2))

# Gradient descent for the slope m of y = m * x (no intercept).
# Loss: MSE(m) = sum((y - m*x)^2) / n, so dMSE/dm = -2/n * sum(x*y - m*x^2).
m <- 1000          # deliberately bad starting slope
alpha <- 0.01      # learning rate
n_iterations <- 1000

# sum(x*y) and sum(x^2) do not depend on m, so hoist them out of the loop.
# The original rebuilt both via three dplyr mutate() calls on df_xy in
# EVERY iteration — 1000 redundant recomputations of loop-invariant sums.
sum_xy <- sum(x * y)
sum_x2 <- sum(x^2)

# Gradient-descent update using the formula above.
for (i in seq_len(n_iterations)) {
  m_gradient <- -2 / length(x) * (sum_xy - m * sum_x2)
  m <- m - alpha * m_gradient
}
print(m)
## previous run: [1] 0.0500006  (converges to the true slope 0.05)
advertising_model <- lm(sales ~ TV, data = adv_train)  # reference refit (same as earlier)



# Same gradient descent, additionally recording the MSE at every iteration.
m <- 1000
alpha <- 0.01
n_iterations <- 1000
errors <- numeric(n_iterations)  # preallocated; the original grew it with c()

# Loop-invariant sums hoisted out of the loop (the original recomputed them
# each iteration via dplyr mutate(), including a sloppy `df_xy =df_xy = ...`
# double assignment).
sum_xy <- sum(x * y)
sum_x2 <- sum(x^2)

for (i in seq_len(n_iterations)) {
  errors[i] <- sum((y - m * x)^2) / length(x)
  m_gradient <- -2 / length(x) * (sum_xy - m * sum_x2)
  m <- m - alpha * m_gradient
}
print(m)
## previous run: [1] 0.0500006
plot(errors)  # error curve: should fall steeply then flatten at the minimum

# For alpha = 0.2 it takes fewer iterations to arrive at the global minimum.
# If alpha is increased further the updates can overshoot and oscillate
# around the minimum; ideally alpha is small enough for stable convergence.

m <- 1000
alpha <- 0.2
n_iterations <- 1000  # more iterations is technically better but computationally slower
errors <- numeric(n_iterations)  # preallocated; the original grew it with c()

# Loop-invariant sums hoisted out of the loop (the original recomputed them
# each iteration via dplyr mutate()).
sum_xy <- sum(x * y)
sum_x2 <- sum(x^2)

for (i in seq_len(n_iterations)) {
  errors[i] <- sum((y - m * x)^2) / length(x)
  m_gradient <- -2 / length(x) * (sum_xy - m * sum_x2)
  m <- m - alpha * m_gradient
}
print(m)
## previous run: [1] 0.05
plot(errors)

# Trace the slope estimate at every iteration alongside its error so the
# descent path can be drawn on the error curve.
m <- 1000
alpha <- 0.01
n_iterations <- 1000  # more iterations is technically better but computationally slower
errors <- numeric(n_iterations)  # preallocated; the original grew both with c()
m_vals <- numeric(n_iterations)

# Loop-invariant sums hoisted out of the loop (the original recomputed them
# each iteration via dplyr mutate()).
sum_xy <- sum(x * y)
sum_x2 <- sum(x^2)

for (i in seq_len(n_iterations)) {
  m_vals[i] <- m
  errors[i] <- sum((y - m * x)^2) / length(x)
  m_gradient <- -2 / length(x) * (sum_xy - m * sum_x2)
  m <- m - alpha * m_gradient
}
print(m)
## previous run: [1] 0.0500006
# Descent path: error as a function of the visited slope values.
{{plot(m_vals, errors)
  lines(m_vals, errors)}}

# For alpha = 0.2 it takes fewer points (iterations) to reach the global minimum; for alpha = 0.01 it takes more. Visualise the respective graphs to compare.


# Fresh train/test split, then fit lm() for reference coefficients — the
# manual gradient descent below should approach these.
# NOTE: as at the top of the file, the original sampled train and test rows
# independently so they could overlap; sample the indices once and split.
train_idx <- sample(seq_len(nrow(advertising)), 160)
adv_train <- advertising[train_idx, ]
adv_testing <- advertising[setdiff(seq_len(nrow(advertising)), train_idx), ]
lm(formula = sales ~ TV, data = adv_train)
## previous run: (Intercept) 7.07521, TV 0.04777
## (coefficients vary with the random split)
# Gradient descent for BOTH slope (m) and intercept (c1) of
# sales ~ standardised TV spend.
#
# BUG FIXES vs the original:
#  * The loop initialised and traced `c1` but computed with `c`, which at
#    this point in the script is the length-10 grid vector from the earlier
#    grid search — the error and both gradients silently recycled a vector
#    intercept, and print(c) emitted 10 numbers instead of one.
#  * The per-iteration dplyr mutate() calls operated on df_xy, whose x/y
#    COLUMNS are the old rnorm data (inside mutate the column names shadow
#    the new x/y globals), so the gradients were computed on the wrong data.
#  Both gradients are now computed directly from the x/y vectors defined
#  here, with the trace vectors preallocated instead of grown with c().
m <- 1
x <- scale(adv_train$TV)   # standardised TV spend
y <- adv_train$sales
c1 <- 10                   # intercept estimate (named c1 to avoid base::c)
alpha <- 0.01
iterations <- 1000
errors <- numeric(iterations)
m_vals <- numeric(iterations)
c1_vals <- numeric(iterations)

n <- length(x)
for (i in seq_len(iterations)) {
  m_vals[i] <- m
  c1_vals[i] <- c1

  residual <- y - (m * x + c1)
  errors[i] <- sum(residual^2) / n

  # d(MSE)/dm = -2/n * sum(x * (y - m*x - c1))
  m_gradient <- -2 / n * sum(x * residual)
  # d(MSE)/dc = -2/n * sum(y - m*x - c1)
  c_gradient <- -2 / n * sum(residual)

  m <- m - alpha * m_gradient
  c1 <- c1 - alpha * c_gradient
}
print(m)
print(c1)